tutorials-and-examples/how-tos/Automation Setup - Configure Azure Machine Learning Pipelines.ipynb

{ "cells": [ { "cell_type": "markdown", "source": [ "# Automation Setup - Configure Azure Machine Learning Pipelines\r\n", "\r\n", "__Notebook Version:__ 1.0<br>\r\n", "__Python Version:__ Python 3.8 - AzureML<br>\r\n", "__Required Packages:__ No<br>\r\n", "__Platforms Supported:__ Azure Machine Learning Notebooks\r\n", " \r\n", "__Data Source Required:__ No \r\n", " \r\n", "### Description\r\n", "This is the second notebook of series for setting up Microsoft Sentinel notebook automation platform based on Azure Machine Learning Pipelines.</br>\r\n", "Before starting this notebook, you should have a notebook to be executed automatically ready. </br>\r\n", "This notebook provides step-by-step instructions to create Azure Machine Learning Pipeline, publish it, and schedule to run the pipeline to execute the targeted notebook.</br>\r\n", "\r\n", "*** Please run the cells sequentially to avoid errors. Please do not use \"run all cells\". *** <br>\r\n", "\r\n", "## Table of Contents\r\n", "1. Warm-up\r\n", "2. Authentication to Azure Resources\r\n", "3. Azure Machine Learning Pipleline" ], "metadata": { "nteract": { "transient": { "deleting": false } } } }, { "cell_type": "markdown", "source": [ "## 1. Warm-up" ], "metadata": { "nteract": { "transient": { "deleting": false } } } }, { "cell_type": "code", "source": [ "# Azure Machine Learning and Pipeline SDK-specific imports\r\n", "# azureml\r\n", "import azureml.core\r\n", "from azureml.core import Workspace, Experiment\r\n", "from azureml.core.datastore import Datastore\r\n", "from azureml.core.runconfig import RunConfiguration\r\n", "from azureml.core.conda_dependencies import CondaDependencies\r\n", "from azureml.contrib.notebook import NotebookRunConfig, AzureMLNotebookHandler\r\n", "from azureml.pipeline.core import Pipeline\r\n", "from azureml.pipeline.core import PipelineData\r\n", "from azureml.contrib.notebook import NotebookRunnerStep\r\n", "from azureml.pipeline.core.schedule import ScheduleRecurrence, Schedule\r\n", "\r\n", "# azure common/core\r\n", "from azure.common.credentials import get_azure_cli_credentials\r\n", "from azure.mgmt.resource import ResourceManagementClient\r\n", "\r\n", "# Python/ipython\r\n", "import json\r\n", "from datetime import datetime\r\n", "from IPython.display import display, HTML, Markdown\r\n", "\r\n", "# Check core SDK version number\r\n", "print(\"SDK version:\", azureml.core.VERSION)" ], "outputs": [], "execution_count": null, "metadata": { "gather": { "logged": 1642211715022 } } }, { "cell_type": "code", "source": [ "# Functions will be used in this notebook\r\n", "def read_config_values(file_path):\r\n", " \"This loads pre-generated parameters for Microsoft Sentinel Workspace\"\r\n", " with open(file_path) as json_file:\r\n", " if json_file:\r\n", " json_config = json.load(json_file)\r\n", " return (json_config[\"tenant_id\"],\r\n", " json_config[\"subscription_id\"],\r\n", " json_config[\"resource_group\"],\r\n", " json_config[\"workspace_id\"],\r\n", " json_config[\"workspace_name\"],\r\n", " json_config[\"user_alias\"],\r\n", " json_config[\"user_object_id\"])\r\n", " return None\r\n", "\r\n", "def has_valid_token():\r\n", " \"Check to see if there is a valid AAD token\"\r\n", " try:\r\n", " credentials, sub_id = get_azure_cli_credentials()\r\n", " creds = credentials._get_cred(resource=None)\r\n", " token = creds._token_retriever()[2]\r\n", " print(\"Successfully signed in.\")\r\n", " return True\r\n", " except Exception as ex:\r\n", " if \"Please run 'az login' to setup account\" in 
str(ex):\r\n", " print(\"Please sign in first.\")\r\n", " return False\r\n", " elif \"AADSTS70043: The refresh token has expired\" in str(ex):\r\n", " message = \"**The refresh token has expired. <br> Please continue your login process. Then: <br> 1. If you plan to run multiple notebooks on the same compute instance today, you may restart the compute instance by clicking 'Compute' on left menu, then select the instance, clicking 'Restart'; <br> 2. Otherwise, you may just restart the kernel from top menu. <br> Finally, close and re-load the notebook, then re-run cells one by one from the top.**\"\r\n", " display(Markdown(message))\r\n", " return False\r\n", " elif \"[Errno 2] No such file or directory: '/home/azureuser/.azure/azureProfile.json'\" in str(ex):\r\n", " print(\"Please sign in.\")\r\n", " return False\r\n", " else:\r\n", " print(str(ex))\r\n", " return False\r\n", " except:\r\n", " print(\"Please restart the kernel, and run 'az login'.\")\r\n", " return False" ], "outputs": [], "execution_count": null, "metadata": { "jupyter": { "source_hidden": false, "outputs_hidden": false }, "nteract": { "transient": { "deleting": false } }, "gather": { "logged": 1642211717929 } } }, { "cell_type": "code", "source": [ "# Calling the above function to populate Microsoft Sentinel workspace parameters\r\n", "# The file, config.json, was generated by the system, however, you may modify the values, or manually set the variables\r\n", "tenant_id, subscription_id, resource_group, workspace_id, workspace_name, user_alias, user_object_id = read_config_values('config.json');\r\n", "print(\"Subscription Id: \" + subscription_id)" ], "outputs": [], "execution_count": null, "metadata": { "jupyter": { "source_hidden": false, "outputs_hidden": false }, "nteract": { "transient": { "deleting": false } }, "gather": { "logged": 1642211719915 } } }, { "cell_type": "markdown", "source": [ "## 2. Authentication to Azure Resources" ], "metadata": { "nteract": { "transient": { "deleting": false } } } }, { "cell_type": "code", "source": [ "# Azure CLI is used to get device code to login into Azure, you need to copy the code and open the DeviceLogin site.\r\n", "# You may add [--tenant $tenant_id] to the command\r\n", "if has_valid_token() == False:\r\n", " !echo -e '\\e[42m'\r\n", " !az login --tenant $tenant_id --use-device-code" ], "outputs": [], "execution_count": null, "metadata": { "jupyter": { "source_hidden": false, "outputs_hidden": false }, "nteract": { "transient": { "deleting": false } }, "gather": { "logged": 1642211721898 } } }, { "cell_type": "markdown", "source": [ "## 3. Azure Machine Learning Pipleline" ], "metadata": { "nteract": { "transient": { "deleting": false } } } }, { "cell_type": "code", "source": [ "# 1. Enter resource names\r\n", "# Enter name of an Azure resource group\r\n", "resource_group = 'myresourcegroup'\r\n", "# Enter current AML workspace name\r\n", "current_aml_workspace_name = 'auto2022'\r\n", "# Enter compute cluster name\r\n", "amlcompute_cluster_name = 'compcl2022'\r\n" ], "outputs": [], "execution_count": null, "metadata": { "jupyter": { "source_hidden": false, "outputs_hidden": false }, "nteract": { "transient": { "deleting": false } }, "gather": { "logged": 1642211723878 } } }, { "cell_type": "code", "source": [ "# 2. 
Get AML workspace\r\n", "ws = Workspace.get(name=current_aml_workspace_name, subscription_id=subscription_id, resource_group=resource_group)\r\n", "print(ws)\r\n", "ws.set_default_datastore(\"workspaceblobstore\")\r\n", "datastore = Datastore.get(ws, \"workspaceblobstore\")" ], "outputs": [], "execution_count": null, "metadata": { "jupyter": { "source_hidden": false, "outputs_hidden": false }, "nteract": { "transient": { "deleting": false } }, "gather": { "logged": 1642211729390 } } }, { "cell_type": "code", "source": [ "# 3. Create a new RunConfig object\r\n", "source_directory = ''\r\n", "notebook_name = 'Automation Gallery - Credential Scan on Azure Blob Storage.ipynb'\r\n", "output_notebook_name = 'blob_scan_results.ipynb'\r\n", "conda_run_config = RunConfiguration(framework=\"python\")\r\n", "conda_run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_CPU_IMAGE\r\n", "print('conda-run config is ready')\r\n", "\r\n", "# Create notebook run configuration and set parameters values\r\n", "handler = AzureMLNotebookHandler(timeout=600, progress_bar=False, log_output=True)\r\n", "cfg = NotebookRunConfig(source_directory=source_directory, notebook=notebook_name,\r\n", " handler = handler,\r\n", " parameters={},\r\n", " run_config=conda_run_config,\r\n", " output_notebook=output_notebook_name)\r\n", "\r\n", "print(\"Notebook Run Config is created.\")" ], "outputs": [], "execution_count": null, "metadata": { "jupyter": { "source_hidden": false, "outputs_hidden": false }, "nteract": { "transient": { "deleting": false } }, "gather": { "logged": 1642212053102 } } }, { "cell_type": "code", "source": [ "# 4. Define NotebookRunnerStep\r\n", "#my_pipeline_param = PipelineParameter(name=\"my_pipeline_param\", default_value=datetime.now().strftime(\"%Y-%m-%d %H:%M:%S\"))\r\n", "output_name = \"notebookresult_2022\"\r\n", "\r\n", "output_from_notebook = PipelineData(name=\"notebook_processed_data\", datastore=Datastore.get(ws, \"workspaceblobstore\"),output_overwrite=True, output_mode=\"upload\")\r\n", "notebook_runner_step = NotebookRunnerStep(name=\"sentinel_notebook_step\",\r\n", " notebook_run_config=cfg,\r\n", " params = {},\r\n", " # params={\"my_pipeline_param\": my_pipeline_param},\r\n", " inputs=[],\r\n", " outputs=[], \r\n", " allow_reuse=False,\r\n", " compute_target=amlcompute_cluster_name,\r\n", " output_notebook_pipeline_data_name=output_name)\r\n", "\r\n", "print(\"Notebook Runner Step is Created.\")" ], "outputs": [], "execution_count": null, "metadata": { "jupyter": { "source_hidden": false, "outputs_hidden": false }, "nteract": { "transient": { "deleting": false } }, "gather": { "logged": 1642212057899 } } }, { "cell_type": "code", "source": [ "# 5. 
Build the pipeline and publish it\r\n", "pipeline4sentinel = Pipeline(workspace=ws, steps=[notebook_runner_step])\r\n", "print(\"Pipeline creation complete\")\r\n", "\r\n", "# Publish the pipeline\r\n", "timenow = datetime.now().strftime('%Y-%m-%d-%H-%M')\r\n", "pipeline_name = \"Sentinel-Pipeline-\" + timenow\r\n", "\r\n", "published_sentinel_pipeline = pipeline4sentinel.publish(\r\n", " name=pipeline_name, \r\n", " description=pipeline_name)\r\n", "print(\"Newly published pipeline id: {}\".format(published_sentinel_pipeline.id))\r\n", "print(\"Endpoint: {}\".format(published_sentinel_pipeline.endpoint))" ], "outputs": [], "execution_count": null, "metadata": { "jupyter": { "source_hidden": false, "outputs_hidden": false }, "nteract": { "transient": { "deleting": false } }, "gather": { "logged": 1642212067884 } } }, { "cell_type": "code", "source": [ "# 6. Create a schedule for the published pipeline using a recurrence\r\n", "schedule_name = 'sentinel_schedule'\r\n", "experiment_name = 'sentinel_experiment_2022'\r\n", "recurrence = ScheduleRecurrence(frequency=\"Day\", interval=1, hours=[22], minutes=[30]) # Runs every day at 10:30 PM\r\n", "#recurrence = ScheduleRecurrence(frequency=\"Hour\", interval=8) # Runs every 8 hours \r\n", "\r\n", "schedule = Schedule.create(workspace=ws, name=schedule_name,\r\n", " pipeline_id=published_sentinel_pipeline.id, \r\n", " experiment_name=experiment_name,\r\n", " recurrence=recurrence,\r\n", " wait_for_provisioning=True,\r\n", " description=\"Schedule to run Sentinel notebook\")\r\n", "\r\n", "print(\"Created schedule with id: {}\".format(schedule.id))" ], "outputs": [], "execution_count": null, "metadata": { "jupyter": { "source_hidden": false, "outputs_hidden": false }, "nteract": { "transient": { "deleting": false } }, "gather": { "logged": 1642212083888 } } } ], "metadata": { "kernelspec": { "name": "python38-azureml", "language": "python", "display_name": "Python 3.8 - AzureML" }, "language_info": { "name": "python", "version": "3.8.1", "mimetype": "text/x-python", "codemirror_mode": { "name": "ipython", "version": 3 }, "pygments_lexer": "ipython3", "nbconvert_exporter": "python", "file_extension": ".py" }, "kernel_info": { "name": "python38-azureml" }, "microsoft": { "host": { "AzureML": { "notebookHasBeenCompleted": true } } }, "nteract": { "version": "nteract-front-end@1.0.0" } }, "nbformat": 4, "nbformat_minor": 0 }